# Authors: Daniele Moro and Maro
import pandas as pd
import numpy as np
import pandas as pd
import glob
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
import seaborn as sns
import sklearn
from nltk.corpus import stopwords
import collections as c
import nltk
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.translate.bleu_score import corpus_bleu
import os
from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from keras.utils import plot_model
from keras.layers.merge import concatenate
import IPython
from tqdm import tqdm_notebook as tqdm
import requests
# English stopwords, used below to filter the word-frequency plots
stops = stop_words = set(stopwords.words('english'))
try:
    from sklearn.model_selection import train_test_split
except ModuleNotFoundError:
    # fall back for scikit-learn < 0.18, where the helper lived in
    # sklearn.cross_validation
    from sklearn.cross_validation import train_test_split
# the vgg 19 model is used for getting the image features
vgg19_model = VGG19(weights='imagenet')
# keep the network up to (and including) the final 1000-way 'predictions' layer
vgg19_model = Model(inputs=vgg19_model.input, outputs=vgg19_model.get_layer('predictions').output)
# Many of these functions were inspired by the following tutorial:
# https://medium.com/@dev.elect.iitd/neural-machine-translation-using-word-level-seq2seq-model-47538cba8cd7
def process_data(filename="./data_with_image_features.json", top_comments=10, max_sent=3, max_words=60):
    """Load the scraped reddit JSON and return one cleaned row per comment.

    Steps: keep the top `top_comments` comments of each post, explode them to
    one row each, normalize whitespace, lowercase, truncate to `max_sent`
    sentences and `max_words` tokens, strip punctuation and digits, and wrap
    every comment in START_/_END markers for seq2seq training.

    Args:
        filename: path to a JSON file with columns `title`, `comments`
            (list of strings per post), `image_feature` and `url`.
        top_comments: maximum number of comments kept per post.
        max_sent: maximum number of sentences kept per title/comment.
        max_words: hard cap on the number of word tokens in a comment.

    Returns:
        A pandas DataFrame with columns ["title", "comments", "image_feature", "url"].
    """
    reddit_data = pd.read_json(filename)
    # keep only the highest-ranked comments of each post
    reddit_data.comments = [x[:top_comments] for x in reddit_data.comments]
    # explode: repeat every other column once per comment so that each
    # comment gets its own row
    lst_col = 'comments'
    df = reddit_data
    reddit_data = pd.DataFrame({
        col: np.repeat(df[col].values, df[lst_col].str.len())
        for col in df.columns.drop(lst_col)}
    ).assign(**{lst_col: np.concatenate(df[lst_col].values)})[df.columns].dropna().reset_index(drop=True)
    # select relevant columns
    reddit_data = reddit_data[["title", "comments", "image_feature", "url"]]
    # normalize whitespace
    reddit_data.comments = [x.replace("\n", ' ').replace("\t", ' ') for x in reddit_data.comments]
    reddit_data.title = [x.replace("\n", ' ').replace("\t", ' ') for x in reddit_data.title]
    # lowercase everything
    reddit_data.title = reddit_data.title.apply(lambda x: x.lower())
    reddit_data.comments = reddit_data.comments.apply(lambda x: x.lower())
    # keep only the first max_sent sentences
    reddit_data.title = [". ".join(nltk.sent_tokenize(x)[:max_sent]) for x in reddit_data.title]
    reddit_data.comments = [". ".join(nltk.sent_tokenize(x)[:max_sent]) for x in reddit_data.comments]
    # if the comment is too long, cut it off at max_words tokens
    reddit_data.comments = [" ".join((nltk.word_tokenize(x))[:max_words]) for x in reddit_data.comments]
    # remove punctuation -- build the set once; it used to be re-created for
    # every single character inside the generator expression
    punct = set(string.punctuation)
    reddit_data.title = reddit_data.title.apply(lambda x: ''.join(ch for ch in x if ch not in punct))
    reddit_data.comments = reddit_data.comments.apply(lambda x: ''.join(ch for ch in x if ch not in punct))
    # remove digits
    remove_numbers = str.maketrans('', '', digits)
    reddit_data.title = reddit_data.title.apply(lambda x: x.translate(remove_numbers))
    reddit_data.comments = reddit_data.comments.apply(lambda x: x.translate(remove_numbers))
    # add the decoder start/end markers
    reddit_data.comments = reddit_data.comments.apply(lambda x: 'START_ ' + x + ' _END')
    return reddit_data
def build_tokenizers(reddit, vocab_size=1000):
    """Build word -> index tokenizers for titles (input) and comments (target).

    Each side's vocabulary is restricted to its `vocab_size` most frequent
    tokens; indices are then assigned in sorted-word order (so index 0 is the
    alphabetically first word, matching the original behavior).

    Args:
        reddit: DataFrame with `title` and `comments` string columns.
        vocab_size: maximum vocabulary size per side.

    Returns:
        (input_tokenizer, target_tokenizer) dicts mapping word -> int index.
    """
    def _top_words(texts):
        # flatten all tokens of all rows, keep the vocab_size most common
        tokens = [w for text in texts for w in nltk.word_tokenize(text)]
        return [word for word, _count in Counter(tokens).most_common(vocab_size)]

    input_words = sorted(_top_words(reddit.title))
    target_words = sorted(_top_words(reddit.comments))
    input_tokenizer = {word: i for i, word in enumerate(input_words)}
    target_tokenizer = {word: i for i, word in enumerate(target_words)}
    return input_tokenizer, target_tokenizer
def get_glove_embeddings(glove_dir):
    """Parse the 300-d GloVe vectors into a word -> float32 ndarray dict.

    Args:
        glove_dir: directory containing 'glove.6B.300d.txt'.

    Returns:
        dict mapping each word to its embedding vector.
    """
    path = os.path.join(glove_dir, 'glove.6B.300d.txt')
    embeddings_index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            # each line is: <word> <coef_1> ... <coef_300>
            token, *vector = line.split()
            embeddings_index[token] = np.asarray(vector, dtype='float32')
    print('Found {} word vectors'.format(len(embeddings_index)))
    return embeddings_index
def build_embedding_matrix(tokenizer, embeddings, embedding_dim=300):
    """Build a (vocab_size, embedding_dim) matrix of pre-trained vectors.

    Row i holds the embedding of the word whose tokenizer index is i; rows
    for words missing from `embeddings` stay all-zero.

    Args:
        tokenizer: dict word -> int index (indices in [0, len(tokenizer))).
        embeddings: dict word -> embedding vector.
        embedding_dim: width of each embedding vector.

    Returns:
        numpy array of shape (len(tokenizer), embedding_dim).
    """
    embedding_matrix = np.zeros((len(tokenizer), embedding_dim))
    # iterate the dict directly -- no need to materialize items() into a list
    for word, i in tokenizer.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
def get_input_target_data(reddit, input_tokenizer, target_tokenizer):
    """Vectorize titles and comments into the three arrays the model consumes.

    Returns:
        encoder_input_data: (n, max_title_len) token indices of each title.
        decoder_input_data: (n, max_comment_len) token indices of each comment.
        decoder_target_data: (n, max_comment_len, vocab) one-hot targets,
            shifted one timestep ahead of decoder_input_data.
    Words absent from the tokenizers are left as 0 (padding).
    """
    comment_len = np.max([len(c.split(' ')) for c in reddit.comments])
    title_len = np.max([len(t.split(' ')) for t in reddit.title])
    encoder_input_data = np.zeros((len(reddit.title), title_len), dtype='float32')
    decoder_input_data = np.zeros((len(reddit.comments), comment_len), dtype='float32')
    decoder_target_data = np.zeros((len(reddit.comments), comment_len, len(target_tokenizer)), dtype='float32')
    for row, (title, comment) in enumerate(zip(reddit.title, reddit.comments)):
        for pos, word in enumerate(title.split()):
            if word in input_tokenizer:
                encoder_input_data[row, pos] = input_tokenizer[word]
        for pos, word in enumerate(comment.split()):
            if word not in target_tokenizer:
                continue
            decoder_input_data[row, pos] = target_tokenizer[word]
            if pos > 0:
                # the target sequence is the decoder input shifted left by one
                decoder_target_data[row, pos - 1, target_tokenizer[word]] = 1.
    return encoder_input_data, decoder_input_data, decoder_target_data
def get_image_features(reddit):
    """Tile each post's 1000-d image feature vector across decoder timesteps.

    Returns a (read-only) broadcast view of shape
    (n_samples, max_comment_length, 1000), so the same feature vector is
    presented at every decoder timestep without copying memory.
    """
    longest_comment = np.max([len(c.split(' ')) for c in reddit.comments])
    feats = np.array([np.array(f) for f in reddit.image_feature]).reshape(-1, 1, 1000)
    return np.broadcast_to(feats, (feats.shape[0], longest_comment, feats.shape[2]))
def define_model(input_tokenizer, target_tokenizer, embedding_matrix_input, embedding_matrix_target):
    """Build the training model plus separate encoder/decoder inference models.

    The encoder embeds the title and keeps only its final LSTM states; the
    decoder embeds the comment, concatenates the image features at every
    timestep, and predicts the next word with a softmax over the target vocab.

    Args:
        input_tokenizer / target_tokenizer: word -> index dicts (sizes define
            the embedding/vocab dimensions).
        embedding_matrix_input / embedding_matrix_target: pre-trained
            (vocab, dim) embedding weights; both are frozen.

    Returns:
        (model, encoder_model, decoder_model) -- the full training model and
        the two pieces used at inference time by decode_sequence().
    """
    # --- encoder ---
    embedding_dim_input = embedding_matrix_input.shape[1]
    encoder_inputs = Input(shape=(None,))
    en_x = Embedding(len(input_tokenizer), embedding_dim_input, weights=[embedding_matrix_input], trainable=False)(encoder_inputs)
    encoder = LSTM(50, return_state=True)
    encoder_outputs, state_h, state_c = encoder(en_x)  # discard outputs, keep states
    encoder_states = [state_h, state_c]
    # --- decoder ---
    # BUG FIX: the output embedding size must come from the *target* matrix
    # (it was previously read from embedding_matrix_input; identical only
    # when both matrices happen to share the same width).
    embedding_dim_output = embedding_matrix_target.shape[1]
    num_decoder_tokens = len(target_tokenizer)
    decoder_inputs = Input(shape=(None,))
    dex = Embedding(num_decoder_tokens, embedding_dim_output, weights=[embedding_matrix_target], trainable=False)
    final_dex = dex(decoder_inputs)
    # identity model wrapping the per-timestep image features
    input_image = Input(shape=(None, 1000))
    y = Model(inputs=input_image, outputs=input_image)
    combined = concatenate([final_dex, y.output])
    decoder_lstm = LSTM(50, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(combined, initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    model = Model([encoder_inputs, y.input, decoder_inputs], decoder_outputs)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
    # --- inference models (shared weights with the training model) ---
    encoder_model = Model(encoder_inputs, encoder_states)
    encoder_model.summary()
    decoder_state_input_h = Input(shape=(50,))
    decoder_state_input_c = Input(shape=(50,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    final_dex2 = dex(decoder_inputs)
    # the same image input layer (`y`) is reused here; the second Input that
    # used to be created at this point was never wired into the graph, so it
    # has been removed
    combined2 = concatenate([final_dex2, y.output])
    decoder_outputs2, state_h2, state_c2 = decoder_lstm(combined2, initial_state=decoder_states_inputs)
    decoder_states2 = [state_h2, state_c2]
    decoder_outputs2 = decoder_dense(decoder_outputs2)
    decoder_model = Model(
        [decoder_inputs, y.input] + decoder_states_inputs,
        [decoder_outputs2] + decoder_states2)
    return model, encoder_model, decoder_model
def decode_sequence(input_seq, input_img, encoder_model, decoder_model, input_tokenizer, target_tokenizer):
    """Greedy-decode one comment for a single (title, image) pair.

    Seeds the decoder with the encoder's final states and the 'START_'
    token, then repeatedly feeds back the argmax word until '_END' appears
    or more than 100 tokens were produced.

    Returns:
        The decoded tokens as a list of words (may end with '_END').
    """
    index_to_word = {i: w for w, i in target_tokenizer.items()}
    # encode the input title into the initial decoder states
    states = encoder_model.predict(input_seq)
    # a length-1 target sequence holding the start token
    current = np.zeros((1, 1))
    current[0, 0] = target_tokenizer['START_']
    decoded = []
    while True:
        output_tokens, h, c = decoder_model.predict([current, input_img] + states)
        # greedy sampling: take the most probable word at the last timestep
        best = np.argmax(output_tokens[0, -1, :])
        word = index_to_word[best]
        decoded.append(word)
        # stop on the end marker or a runaway sequence
        if word == '_END' or len(decoded) > 100:
            return decoded
        # feed the sampled token and the updated states back in
        current = np.zeros((1, 1))
        current[0, 0] = best
        states = [h, c]
# load and preprocess the scraped reddit data
data_path = "./data_with_image_features.json"
reddit = process_data(filename = data_path, top_comments=10, max_sent=3, max_words=60)
# peek at the first ten rows (notebook-style display of the expression value)
reddit[0:10]
# --- EDA: most frequent non-stopword tokens in the post titles ---
titles_all_words = [nltk.word_tokenize(x) for x in reddit.title]
titles_all_words = [x for i in titles_all_words for x in i]
titles_all_words = [x for x in titles_all_words if x not in stops]
# take the top 13 and manually drop four uninteresting entries so ten remain
titles_all_words_freq = c.Counter(titles_all_words).most_common(13)
titles_all_words_freq.pop(0)
titles_all_words_freq.pop(0)
titles_all_words_freq.pop(0)
titles_all_words_freq.pop(1)
titles_all_words_freq_df = pd.DataFrame(columns=["words", "counts"], data=titles_all_words_freq)
# relative frequency within the plotted words
titles_all_words_freq_df["freq"] = titles_all_words_freq_df.counts / sum(titles_all_words_freq_df.counts)
sns.set(rc={'figure.figsize': (5, 5)})
sns.set(style="whitegrid")
ax = sns.barplot(x='freq', y='words', data=titles_all_words_freq_df, palette="Blues_d")
ax.set_xlabel('Frequency')  # the x-label used to be set twice; once is enough
ax.set_ylabel('Words')
plt.title("Top ten words in the title")
plt.show()
# --- EDA: most frequent non-stopword tokens in the comments ---
comments_all_words = [nltk.word_tokenize(x) for x in reddit.comments]
comments_all_words = [x for i in comments_all_words for x in i]
comments_all_words = [x for x in comments_all_words if x not in stops]
# take the top 13 and manually drop four uninteresting entries so ten remain
comments_all_words_freq = c.Counter(comments_all_words).most_common(13)
comments_all_words_freq.pop(0)
comments_all_words_freq.pop(0)
comments_all_words_freq.pop(0)
comments_all_words_freq.pop(1)
comments_all_words_freq_df = pd.DataFrame(columns=["words", "counts"], data=comments_all_words_freq)
# relative frequency within the plotted words
comments_all_words_freq_df["freq"] = comments_all_words_freq_df.counts / sum(comments_all_words_freq_df.counts)
sns.set(rc={'figure.figsize': (5, 5)})
sns.set(style="whitegrid")
ax = sns.barplot(x='freq', y='words', data=comments_all_words_freq_df, palette="Blues_d")
ax.set_xlabel('Frequency')  # the x-label used to be set twice; once is enough
ax.set_ylabel('Words')
# BUG FIX: this chart plots comment words, but its title said "titles"
plt.title("Top ten words in the comments")
plt.show()
# distribution of comment lengths (in words)
length_list = [len(comment.split(' ')) for comment in reddit.comments]
# notebook-style display of (mean, max) comment length
np.mean(length_list), np.max(length_list)
plt.hist(length_list, bins=50)
plt.ylabel("Frequency\n")
plt.xlabel("\nNumber of words")
plt.title("Distribution of number of words in comments")
plt.show()
# distribution of title lengths (in words)
length_list = [len(title.split(' ')) for title in reddit.title]
# notebook-style display of (mean, max) title length
np.mean(length_list), np.max(length_list)
plt.hist(length_list, bins=50)
plt.ylabel("Frequency\n")
plt.xlabel("\nNumber of words")
plt.title("Distribution of number of words in titles")
plt.show()
# build tokenizers and glove embeddings
glove_root = "./"
input_tokenizer, target_tokenizer = build_tokenizers(reddit)
embeddings = get_glove_embeddings(glove_root)
# get embedding matrices
input_embedding_matrix = build_embedding_matrix(input_tokenizer, embeddings)
# BUG FIX: the output matrix must be built from the *target* tokenizer; it
# was previously built from input_tokenizer, so the decoder's embedding rows
# did not correspond to the comment vocabulary at all.
output_embedding_matrix = build_embedding_matrix(target_tokenizer, embeddings)
# split the data: 90% train / 10% held-out test, fixed seed for reproducibility
reddit_train, reddit_test = train_test_split(reddit, test_size=0.1, random_state=42)
# rebuild DataFrames with the original columns and a fresh 0..n index
reddit_train = pd.DataFrame(reddit_train, columns = reddit.columns).reset_index(drop = True)
reddit_test = pd.DataFrame(reddit_test, columns = reddit.columns).reset_index(drop = True)
# get the raw data: token-index / one-hot arrays for both splits
encoder_input_data_tr, decoder_input_data_tr, decoder_target_data_tr = get_input_target_data(reddit_train,
                                                                                             input_tokenizer,
                                                                                             target_tokenizer)
encoder_input_data_te, decoder_input_data_te, decoder_target_data_te = get_input_target_data(reddit_test,
                                                                                             input_tokenizer,
                                                                                             target_tokenizer)
# per-timestep VGG19 image features for both splits
image_features_tr = get_image_features(reddit_train)
image_features_te = get_image_features(reddit_test)
# define the model (training model + encoder/decoder inference models)
model, encoder_model, decoder_model = define_model(input_tokenizer, target_tokenizer,
                                                   input_embedding_matrix, output_embedding_matrix)
# `display` is the Jupyter built-in; summary() prints and returns None
display(model.summary())
# train
model_name = "newmodel.h5"
model.fit([encoder_input_data_tr, image_features_tr, decoder_input_data_tr], decoder_target_data_tr,
          validation_data=([encoder_input_data_te, image_features_te, decoder_input_data_te], decoder_target_data_te),
          batch_size = 64,
          epochs = 100)
# persist only the weights (architecture is rebuilt by define_model)
model.save_weights(model_name)
print("Saved model {} to disk".format(model_name))
# source: https://keras.io/visualization/
# the History callback attached to the model records per-epoch metrics
history = model.history
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
def evaluate_model(raw_dataset, decoder_input_data, encoder_input_data, start=0, end=None, print_examples=False):
    """Greedy-decode rows [start, end) and return a unigram corpus BLEU score.

    Optionally displays each example: the post image, the title, the
    predicted comment and the real comment.

    NOTE(review): the body reads the globals decoder_input_data_te,
    encoder_input_data_te and reddit instead of consistently using its
    parameters, and the call sites below pass the encoder/decoder arrays in
    the opposite order of this signature -- the code only behaves because the
    globals are used. Confirm before refactoring the parameter usage.

    NOTE(review): corpus_bleu expects each element of `actual` to be a *list
    of* reference token lists; a bare token list is appended here -- verify
    the BLEU numbers are what is intended.
    """
    if end is None:
        end = len(encoder_input_data)
    # invert the tokenizers: index -> word
    inv_target_tokenizer = {v: k for k, v in target_tokenizer.items()}
    inv_input_tokenizer = {v: k for k, v in input_tokenizer.items()}
    actual, predicted = list(), list()
    for seq_index in tqdm(range(start, end)):
        # reference comment mapped back to words, minus the START_ marker
        act = [inv_target_tokenizer[i] for i in decoder_input_data_te[seq_index] if inv_target_tokenizer[i] != "START_"]
        actual.append(act)
        input_seq = encoder_input_data[seq_index: seq_index + 1]
        input_img = np.array(reddit.image_feature[seq_index]).reshape(1,1,1000)
        pred = decode_sequence(input_seq, input_img, encoder_model, decoder_model, input_tokenizer, target_tokenizer)
        predicted.append(pred)
        url = raw_dataset.url[seq_index]
        # only show an example if its image URL still resolves
        if print_examples and exists_image_url(url):
            #print("seq_index: [{}]".format(seq_index))
            display(IPython.display.Image(url, width = 250))
            # title words, skipping the 0 padding entries
            title_text = [inv_input_tokenizer[i] for i in encoder_input_data_te[seq_index] if i != 0]
            print("INPUT:\n{}\n".format(" ".join(title_text)))
            print("PREDICTED:\n{}\n".format(" ".join(pred)))
            print("TARGET (real comment):\n{}".format(" ".join(act)))
            print("----------")
    # weights put all mass on 1-grams, so this is unigram-only BLEU
    return corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
def exists_image_url(path):
    """Return True iff a HEAD request to `path` answers with HTTP 200 OK."""
    response = requests.head(path)
    return response.status_code == requests.codes.ok
def get_one_img(img_path):
    """Run one image file through VGG19 and return its flattened 1000-d output.

    The image is resized to 224x224 and preprocessed exactly as VGG19 expects.
    """
    img = image.load_img(img_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    batch = preprocess_input(batch)
    # vgg19_model is the module-level VGG19 truncated at the 'predictions' layer
    return vgg19_model.predict(batch).ravel()
def user_input(input_words, img_feats, seq_len):
    """Generate a comment for free-form title text plus VGG19 image features.

    Unknown words map to token id 0 and the sequence is zero-padded to
    `seq_len` (the encoder's expected input length).

    Args:
        input_words: title text, whitespace-tokenized.
        img_feats: 1000-element image feature array (reshaped to (1,1,1000)).
        seq_len: encoder input length, e.g. encoder_input_data_tr.shape[1].

    Returns:
        The decoded comment string with the START_/_END markers stripped.
    """
    tokens = [input_tokenizer.get(w, 0) for w in input_words.split()]
    input_seq = np.array(tokens + [0] * (seq_len - len(tokens))).reshape(1, seq_len)
    input_img = np.array(img_feats.reshape(1,1,1000))
    # (removed an inverse-tokenizer dict and commented-out debug print that
    # were never used)
    decoded_sentence = decode_sequence(input_seq, input_img, encoder_model, decoder_model, input_tokenizer, target_tokenizer)
    return " ".join(decoded_sentence).replace("START_ ", "").replace(" _END","")
# NOTE(review): training above saves to "newmodel.h5" (model_name) but this
# loads "model.h5" -- presumably a previously trained checkpoint; confirm the
# intended file exists before running.
model.load_weights("model.h5")
# corpus-level unigram BLEU over the whole held-out set
evaluate_model(reddit_test, encoder_input_data_te, decoder_input_data_te, print_examples=False)
# try the model on a hand-written title and a local image
text = "Today is my birthday"
img = get_one_img("../images/1.jpg")
user_input(text, img, encoder_input_data_tr.shape[1])
# show 20 qualitative examples (with images) alongside their BLEU score
evaluate_model(reddit_test, encoder_input_data_te, decoder_input_data_te, start=100, end=120, print_examples=True)